#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
suppressWarnings(suppressMessages(library(WGCNA)))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/Gandal/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings)

Relation to external clinical traits

Quantifying module-trait associations

Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}

rm(ME_object)

I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them

moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)

Studying the modules with the highest absolute correlation to Diagnosis

I was expecting the modules to consist mainly of genes with the highest LFC (largest absolute values in PC2), and they are large, but many of the largest genes are not in these modules.

top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])

cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #9B8EFF, #F066EA, #8DAB00
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4))

table(plot_data$ImportantModules)
## 
## #8DAB00 #9B8EFF #F066EA  Others 
##    1277     215    1158   13950
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=ID)) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)

Module Membership vs Gene Significance

create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('|Gene Significance|') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0.

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, aes(id=Module)) +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
rm(i, this_row, new_row, plot_data)

Module Eigengenes

Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.

In all three cases, the Eigengenes separate the behaviour between autism and control samples very clearly!

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis_)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
                    ggtitle(paste0('Module ', module, '  (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
  return(p)
}

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, p1, p2, p3)

Identifying important genes

Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+|GS|}{2}\)

Only two SFARI Genes in the three lists, none from scores 1 and 2…

create_table = function(module){
  top_genes = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module) %>%
              rename('MM' = paste0('MM.',gsub('#','',module))) %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
              top_n(10)
  return(top_genes)
}

top_genes_1 = create_table(top_modules[1])
kable(top_genes_1, caption=paste0('Top 10 genes for module ', top_modules[1], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
Top 10 genes for module #9B8EFF (MTcor = 0.98)
ID MM GS gene.score importance
ENSG00000143384 0.8768314 0.9060349 None 0.8914332
ENSG00000161638 0.8613780 0.8646717 None 0.8630249
ENSG00000183864 0.7916638 0.8719542 None 0.8318090
ENSG00000196935 0.7680729 0.8951553 None 0.8316141
ENSG00000150907 0.7883168 0.8458336 None 0.8170752
ENSG00000148841 0.8218756 0.8042687 None 0.8130721
ENSG00000150457 0.7719291 0.8433350 None 0.8076321
ENSG00000101493 0.7243000 0.8764275 None 0.8003638
ENSG00000148498 0.6952386 0.8949607 None 0.7950996
ENSG00000120278 0.7711860 0.8073816 None 0.7892838
top_genes_2 = create_table(top_modules[2])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[2], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
Top 10 genes for module #F066EA (MTcor = 0.91)
ID MM GS gene.score importance
ENSG00000180573 0.8413858 0.8648084 None 0.8530971
ENSG00000181722 0.8640272 0.8406765 3 0.8523519
ENSG00000003402 0.8807946 0.8010251 None 0.8409098
ENSG00000184678 0.7903735 0.8878029 None 0.8390882
ENSG00000120690 0.7921926 0.7836121 None 0.7879024
ENSG00000107104 0.8162030 0.7554489 4 0.7858260
ENSG00000084093 0.8091856 0.7472778 None 0.7782317
ENSG00000198879 0.8039142 0.7457312 None 0.7748227
ENSG00000198185 0.7958118 0.7482581 None 0.7720349
ENSG00000051620 0.7890904 0.7462823 None 0.7676863
top_genes_3 = create_table(top_modules[3])
kable(top_genes_3, caption=paste0('Top 10 genes for module ', top_modules[3],'  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[3]),1],2),')'))
Top 10 genes for module #8DAB00 (MTcor = -0.92)
ID MM GS gene.score importance
ENSG00000050748 0.9014380 -0.9399810 None 0.9207095
ENSG00000108395 0.9067664 -0.9250494 None 0.9159079
ENSG00000138078 0.8923376 -0.8966280 None 0.8944828
ENSG00000177432 0.8590945 -0.8915541 None 0.8753243
ENSG00000163577 0.8377986 -0.9116043 None 0.8747015
ENSG00000176490 0.8551447 -0.8936753 None 0.8744100
ENSG00000155097 0.8548700 -0.8885175 None 0.8716938
ENSG00000128881 0.8740293 -0.8654230 None 0.8697261
ENSG00000171132 0.8345108 -0.9028094 None 0.8686601
ENSG00000114573 0.8277575 -0.9068579 None 0.8673077
rm(create_table)
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
            mutate(alpha = ifelse(color %in% top_modules & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID),
                                            as.character(top_genes_3$ID)), 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] WGCNA_1.68            fastcluster_1.1.25    dynamicTreeCut_1.63-1
##  [4] knitr_1.24            doParallel_1.0.15     iterators_1.0.12     
##  [7] foreach_1.4.7         polycor_0.7-10        expss_0.10.1         
## [10] GGally_1.4.0          gridExtra_2.3         viridis_0.5.1        
## [13] viridisLite_0.3.0     RColorBrewer_1.1-2    dendextend_1.13.2    
## [16] plotly_4.9.1          glue_1.3.1            reshape2_1.4.3       
## [19] forcats_0.4.0         stringr_1.4.0         dplyr_0.8.3          
## [22] purrr_0.3.3           readr_1.3.1           tidyr_1.0.0          
## [25] tibble_2.1.3          ggplot2_3.2.1         tidyverse_1.3.0      
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 plyr_1.8.5                 
##   [5] lazyeval_0.2.2              splines_3.6.0              
##   [7] crosstalk_1.0.0             BiocParallel_1.20.1        
##   [9] GenomeInfoDb_1.22.0         robust_0.4-18.2            
##  [11] digest_0.6.23               htmltools_0.4.0            
##  [13] GO.db_3.10.0                fansi_0.4.1                
##  [15] magrittr_1.5                checkmate_1.9.4            
##  [17] memoise_1.1.0               fit.models_0.5-14          
##  [19] cluster_2.0.8               annotate_1.64.0            
##  [21] modelr_0.1.5                matrixStats_0.55.0         
##  [23] colorspace_1.4-1            blob_1.2.0                 
##  [25] rvest_0.3.5                 rrcov_1.4-7                
##  [27] haven_2.2.0                 xfun_0.8                   
##  [29] crayon_1.3.4                RCurl_1.95-4.12            
##  [31] jsonlite_1.6                genefilter_1.68.0          
##  [33] zeallot_0.1.0               impute_1.60.0              
##  [35] survival_2.44-1.1           gtable_0.3.0               
##  [37] zlibbioc_1.32.0             XVector_0.26.0             
##  [39] DelayedArray_0.12.2         BiocGenerics_0.32.0        
##  [41] DEoptimR_1.0-8              scales_1.1.0               
##  [43] mvtnorm_1.0-11              DBI_1.1.0                  
##  [45] Rcpp_1.0.3                  xtable_1.8-4               
##  [47] htmlTable_1.13.1            foreign_0.8-71             
##  [49] bit_1.1-15.1                preprocessCore_1.48.0      
##  [51] Formula_1.2-3               stats4_3.6.0               
##  [53] htmlwidgets_1.5.1           httr_1.4.1                 
##  [55] acepack_1.4.1               farver_2.0.3               
##  [57] pkgconfig_2.0.3             reshape_0.8.8              
##  [59] XML_3.98-1.20               nnet_7.3-12                
##  [61] dbplyr_1.4.2                locfit_1.5-9.1             
##  [63] later_1.0.0                 tidyselect_0.2.5           
##  [65] labeling_0.3                rlang_0.4.2                
##  [67] AnnotationDbi_1.48.0        munsell_0.5.0              
##  [69] cellranger_1.1.0            tools_3.6.0                
##  [71] cli_2.0.1                   generics_0.0.2             
##  [73] RSQLite_2.2.0               broom_0.5.3                
##  [75] fastmap_1.0.1               evaluate_0.14              
##  [77] yaml_2.2.0                  bit64_0.9-7                
##  [79] fs_1.3.1                    robustbase_0.93-5          
##  [81] nlme_3.1-139                mime_0.8                   
##  [83] xml2_1.2.2                  compiler_3.6.0             
##  [85] rstudioapi_0.10             reprex_0.3.0               
##  [87] geneplotter_1.64.0          pcaPP_1.9-73               
##  [89] stringi_1.4.5               highr_0.8                  
##  [91] lattice_0.20-38             Matrix_1.2-17              
##  [93] vctrs_0.2.1                 pillar_1.4.3               
##  [95] lifecycle_0.1.0             data.table_1.12.8          
##  [97] bitops_1.0-6                httpuv_1.5.2               
##  [99] GenomicRanges_1.38.0        R6_2.4.1                   
## [101] latticeExtra_0.6-28         promises_1.1.0             
## [103] IRanges_2.20.2              codetools_0.2-16           
## [105] MASS_7.3-51.4               assertthat_0.2.1           
## [107] SummarizedExperiment_1.16.1 DESeq2_1.26.0              
## [109] withr_2.1.2                 S4Vectors_0.24.2           
## [111] GenomeInfoDbData_1.2.2      hms_0.5.3                  
## [113] grid_3.6.0                  rpart_4.1-15               
## [115] rmarkdown_1.14              Cairo_1.5-10               
## [117] shiny_1.4.0                 Biobase_2.46.0             
## [119] lubridate_1.7.4             base64enc_0.1-3